In [8]:
import catboost
print(catboost.__version__)
!python --version
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
0.20.2
Python 3.6.5 :: Anaconda, Inc.
In [6]:
import pandas as pd
import os
import numpy as np
np.set_printoptions(precision=4)
import catboost
from catboost import *
from catboost import datasets
from sklearn.model_selection import train_test_split
from pathlib import Path
In [10]:
DIR = Path('../listing_price_suggest.csv')
DIR
df = pd.read_csv(DIR)
df.head()
Out[10]:
PosixPath('../listing_price_suggest.csv')
Out[10]:
primary_key y_var x1var_cat x2var_cat x3var_cont x4var_cont x5var_cont x6var_cont x7var_cont x8var_cont x9var_cat x10var_cont x11var_cont x12var_cat x13var_cont x14var_cont x15var_cont
0 9 80 1 2 15.9 1.3043 1.13 15.0787 0.00 0.0 0 0.5479 0.0000 0 0.0 1.5300 0.0000
1 57 0 0 1 58.3 0.0000 0.00 0.0750 0.00 0.0 1 0.6849 1.4400 0 0.0 1.5300 0.0533
2 105 0 3 1 143.1 0.0000 0.00 0.0750 0.00 0.0 0 0.1369 0.0000 0 0.0 1.5300 0.0000
3 153 37 3 1 169.6 4.3478 5.65 27.0817 76.59 0.0 0 44.5205 11.1542 0 0.0 0.3642 0.0000
4 201 0 0 2 148.4 0.0000 0.00 0.0750 0.00 0.0 0 0.0000 0.0000 0 0.0 1.5300 0.0000
In [11]:
## Train-test split
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 2:], df['y_var'], test_size=0.33, random_state=5)
data_dict = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}
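As a quick sanity check on the split above (a minimal sketch using the variables just created), the resulting shapes can be printed before training:

print(X_train.shape, X_test.shape)   # 67% / 33% feature splits
print(y_train.shape, y_test.shape)   # matching target splits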
In [13]:
cat_columns = df.columns[df.columns.str.contains("cat")].tolist()
con_columns = df.columns[~df.columns.str.contains("cat")].tolist()
cat_columns, con_columns
Out[13]:
(['x1var_cat', 'x2var_cat', 'x9var_cat', 'x12var_cat'],
 ['primary_key',
  'y_var',
  'x3var_cont',
  'x4var_cont',
  'x5var_cont',
  'x6var_cont',
  'x7var_cont',
  'x8var_cont',
  'x10var_cont',
  'x11var_cont',
  'x13var_cont',
  'x14var_cont',
  'x15var_cont'])
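The categorical column names above are what the fit calls below pass as cat_features. A minimal sketch of building an explicit catboost.Pool for the split (the Pool names here are illustrative; passing cat_features=cat_columns straight to fit, as done below, builds the same structure internally):

# Hypothetical explicit Pool construction for the train/eval splits
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_columns)
eval_pool = Pool(data=X_test, label=y_test, cat_features=cat_columns)
print(train_pool.num_row(), train_pool.num_col())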

Tuning the model with different parameters

In [17]:
from catboost import CatBoostRegressor
model = CatBoostRegressor(
    iterations=15,
    learning_rate=0.1,
    # loss_function='CrossEntropy'
)
model.fit(
    X_train, y_train,
    cat_features=cat_columns,
    eval_set=(X_test, y_test),
    verbose=True
)
print('Model is fitted: ' + str(model.is_fitted()))
print('Model params:')
print(model.get_params())
0:	learn: 182.0475866	test: 162.3287026	best: 162.3287026 (0)	total: 103ms	remaining: 1.45s
1:	learn: 179.0079130	test: 159.8365181	best: 159.8365181 (1)	total: 165ms	remaining: 1.07s
2:	learn: 176.1690952	test: 157.4916424	best: 157.4916424 (2)	total: 200ms	remaining: 799ms
3:	learn: 173.6269355	test: 155.1383437	best: 155.1383437 (3)	total: 234ms	remaining: 643ms
4:	learn: 171.5199746	test: 153.2342240	best: 153.2342240 (4)	total: 259ms	remaining: 518ms
5:	learn: 169.3397207	test: 151.4719757	best: 151.4719757 (5)	total: 288ms	remaining: 431ms
6:	learn: 167.5173495	test: 150.1076319	best: 150.1076319 (6)	total: 321ms	remaining: 367ms
7:	learn: 165.9387398	test: 148.5865881	best: 148.5865881 (7)	total: 363ms	remaining: 317ms
8:	learn: 164.4607045	test: 147.8165920	best: 147.8165920 (8)	total: 399ms	remaining: 266ms
9:	learn: 163.0824830	test: 146.7836819	best: 146.7836819 (9)	total: 450ms	remaining: 225ms
10:	learn: 161.7539790	test: 145.9816164	best: 145.9816164 (10)	total: 503ms	remaining: 183ms
11:	learn: 160.5535172	test: 145.2700298	best: 145.2700298 (11)	total: 544ms	remaining: 136ms
12:	learn: 159.4499208	test: 144.5856219	best: 144.5856219 (12)	total: 595ms	remaining: 91.6ms
13:	learn: 158.5780698	test: 144.2910965	best: 144.2910965 (13)	total: 634ms	remaining: 45.3ms
14:	learn: 157.6678142	test: 143.8010724	best: 143.8010724 (14)	total: 669ms	remaining: 0us

bestTest = 143.8010724
bestIteration = 14

Out[17]:
<catboost.core.CatBoostRegressor at 0x7fb32b278eb8>
Model is fitted: True
Model params:
{'loss_function': 'RMSE', 'learning_rate': 0.1, 'iterations': 15}
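The best validation score shown in the log above (bestTest = 143.8010724) can also be read back from the fitted model; a minimal sketch using the standard CatBoost model API:

# Best eval-set RMSE and the iteration where it was reached
print(model.get_best_score())
print(model.get_best_iteration())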

Plotting the training-time metrics to visualize the training process and get the best score for the given model

In [21]:
model = CatBoostRegressor(
    iterations=50,
    random_seed=63,
    learning_rate=0.5,
)
model.fit(
    X_train, y_train,
    cat_features=cat_columns,
    eval_set=(X_test, y_test),
    verbose=False,
    plot=True
)
Out[21]:
<catboost.core.CatBoostRegressor at 0x7fb32b4fd550>

Training the model with different learning rates

In [26]:
model1 = CatBoostRegressor(
    learning_rate=0.7,
    iterations=100,
    random_seed=0,
    train_dir='learing_rate_0.7'
)

model2 = CatBoostRegressor(
    learning_rate=0.01,
    iterations=100,
    random_seed=0,
    train_dir='learing_rate_0.01'
)
model1.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    cat_features=cat_columns,
    verbose=False
)
model2.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    cat_features=cat_columns,
    verbose=False
)
Out[26]:
<catboost.core.CatBoostRegressor at 0x7fb32b51cac8>
Out[26]:
<catboost.core.CatBoostRegressor at 0x7fb32b51ca20>
In [27]:
from catboost import MetricVisualizer
MetricVisualizer(['learing_rate_0.01', 'learing_rate_0.7']).start()
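Besides the visual comparison, the two runs can be compared numerically; a minimal sketch using the two models fitted above:

# Final/best scores for each learning rate (learn and validation metrics)
print('learning_rate=0.7 :', model1.get_best_score())
print('learning_rate=0.01:', model2.get_best_score())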

Increasing the training iterations to 1000

In [29]:
model = CatBoostRegressor(
    iterations=1000,
    random_seed=63,
    learning_rate=0.5,
#     use_best_model=False
)
model.fit(
    X_train, y_train,
    cat_features=cat_columns,
    eval_set=(X_test, y_test),
    verbose=False,
    plot=True
)
Out[29]:
<catboost.core.CatBoostRegressor at 0x7fb32b585208>
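With this many iterations the validation error can plateau or worsen while the training error keeps falling. A minimal sketch of guarding against that with early stopping (use_best_model and early_stopping_rounds are standard CatBoost options; the values here are illustrative):

model_es = CatBoostRegressor(
    iterations=1000,
    random_seed=63,
    learning_rate=0.5,
    use_best_model=True           # keep only the trees up to the best eval-set iteration
)
model_es.fit(
    X_train, y_train,
    cat_features=cat_columns,
    eval_set=(X_test, y_test),
    early_stopping_rounds=50,     # stop if the eval metric does not improve for 50 iterations
    verbose=False
)
print(model_es.get_best_iteration(), model_es.tree_count_)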

Cross-validation to avoid overfitting

  • Here we use 5-fold cross-validation
In [31]:
from catboost import cv

params = {}
params['loss_function'] = 'MAE'
params['iterations'] = 100
params['custom_loss'] = 'RMSE'
params['random_seed'] = 63
params['learning_rate'] = 0.5

cv_data = cv(
    params = params,
    pool = Pool(df.iloc[:,2:], label=df['y_var'], cat_features=cat_columns),
    fold_count=5,
    shuffle=True,
    partition_random_seed=0,
    plot=True,
    stratified=False,
    verbose=False
)
In [33]:
cv_data
Out[33]:
iterations test-MAE-mean test-MAE-std train-MAE-mean train-MAE-std test-RMSE-mean test-RMSE-std train-RMSE-mean train-RMSE-std
0 0 48.893247 0.548771 48.893048 0.082923 181.629197 11.371901 182.032605 2.741145
1 1 46.174887 0.456244 46.163602 0.136518 177.555057 11.539451 177.963765 2.844985
2 2 44.631777 0.414381 44.614996 0.108849 172.612782 12.040638 173.050979 2.689868
3 3 44.113630 0.377788 44.078426 0.179333 170.710102 12.138824 171.092317 2.746240
4 4 43.814020 0.429458 43.771432 0.190768 169.334540 12.340713 169.680288 2.741719
... ... ... ... ... ... ... ... ... ...
95 95 42.960165 0.348937 42.835320 0.217596 164.722077 11.464189 165.076554 3.811292
96 96 42.960167 0.348948 42.835291 0.217589 164.721908 11.464089 165.076365 3.811389
97 97 42.960166 0.348960 42.835278 0.217595 164.721881 11.464200 165.076343 3.811312
98 98 42.960084 0.348715 42.835216 0.217628 164.721455 11.463246 165.075882 3.811969
99 99 42.960716 0.348603 42.834773 0.217733 164.721651 11.464038 165.074745 3.811438

100 rows × 9 columns
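Since cv returns a pandas DataFrame, the best averaged fold score can be pulled out directly; a minimal sketch:

# Iteration with the lowest mean test MAE across the 5 folds
best_iter = cv_data['test-MAE-mean'].idxmin()
print('best iteration:', best_iter)
print('best test MAE : {:.4f} ± {:.4f}'.format(
    cv_data['test-MAE-mean'][best_iter],
    cv_data['test-MAE-std'][best_iter]))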

Randomized search over a parameter grid to find optimum hyperparameters for the model

In [35]:
grid = {'learning_rate': [0.03, 0.1],
        'depth': [4, 6, 10],
        'l2_leaf_reg': [1, 3, 5, 7, 9]}
model_search = CatBoostRegressor(
    iterations=100,
    random_seed=63,
    learning_rate=0.5,
#     use_best_model=False
)
randomized_search_result = model_search.randomized_search(grid,
                                                   X=df.iloc[:,2:],
                                                   y=df['y_var'],
                                                   plot=True)
0:	loss: 150.0275554	best: 150.0275554 (0)	total: 2.36s	remaining: 21.3s
1:	loss: 150.4034040	best: 150.0275554 (0)	total: 4.26s	remaining: 17s
2:	loss: 152.3746019	best: 150.0275554 (0)	total: 6.67s	remaining: 15.6s
3:	loss: 149.7304165	best: 149.7304165 (3)	total: 8.9s	remaining: 13.4s
4:	loss: 152.4509373	best: 149.7304165 (3)	total: 11.5s	remaining: 11.5s
5:	loss: 149.7566562	best: 149.7304165 (3)	total: 13.8s	remaining: 9.21s
6:	loss: 150.5363780	best: 149.7304165 (3)	total: 16.1s	remaining: 6.92s
7:	loss: 149.7589801	best: 149.7304165 (3)	total: 21.2s	remaining: 5.3s
8:	loss: 152.0012754	best: 149.7304165 (3)	total: 25.8s	remaining: 2.87s
9:	loss: 149.6813544	best: 149.6813544 (9)	total: 31.5s	remaining: 0us
Estimating final quality...
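randomized_search returns a dictionary; a minimal sketch of reading the winning hyperparameters and the cross-validation history it kept ('params' and 'cv_results' are part of the documented return value):

# Best hyperparameter combination found by the randomized search
print(randomized_search_result['params'])

# Metric history recorded while estimating the final quality
print(list(randomized_search_result['cv_results'].keys()))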

Feature Importance

  • Finding the important features from the model
In [36]:
model.get_feature_importance(prettified=True)
Out[36]:
Feature Id Importances
0 x6var_cont 34.392928
1 x4var_cont 15.037075
2 x8var_cont 12.997805
3 x12var_cat 6.013575
4 x5var_cont 5.832739
5 x7var_cont 5.784144
6 x11var_cont 4.910128
7 x3var_cont 3.856842
8 x9var_cat 3.337266
9 x15var_cont 2.871873
10 x14var_cont 1.541487
11 x10var_cont 1.528342
12 x13var_cont 1.516160
13 x2var_cat 0.379638
14 x1var_cat 0.000000
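The same importances read more easily as a bar chart; a minimal sketch using matplotlib (imported here, since only %matplotlib inline was set at the top):

import matplotlib.pyplot as plt

fi = model.get_feature_importance(prettified=True)
plt.figure(figsize=(8, 5))
plt.barh(fi['Feature Id'], fi['Importances'])
plt.gca().invert_yaxis()          # most important feature at the top
plt.xlabel('Importance')
plt.tight_layout()
plt.show()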
In [58]:
d = model.plot_tree(
    tree_idx=0,
    pool=pool1  # pool1 is the Pool built in the cell labelled In [42] below, which was executed before this one
)
In [63]:
d
Out[63]:
[Rendered graphviz tree for iteration 0: the root splits on x12var_cat, followed by splits on x7var_cont > 20059.9, x6var_cont > 114.565, x8var_cont > 0.5, x4var_cont > 0.6521 and x6var_cont > 725.06, with leaf values at depth 6.]
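The object returned by plot_tree is a graphviz graph, so it can also be saved to disk; a minimal sketch (render needs the graphviz system binaries, and the file name is illustrative):

# Save the first tree as an SVG next to the notebook (hypothetical file name)
d.render('catboost_tree_0', format='svg', cleanup=True)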
In [42]:
import shap

pool1 = Pool(data=df.iloc[:, 2:], label=df['y_var'], cat_features=cat_columns)

# Per-row SHAP values: one column per feature plus a final column holding the expected (base) value
shap_values = model.get_feature_importance(pool1, type='ShapValues')

expected_value = shap_values[0, -1]
shap_values = shap_values[:, :-1]

print(shap_values.shape)

shap_values

shap.initjs()
shap.force_plot(expected_value, shap_values[3, :], df.iloc[3, 2:])
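Beyond a single-row force plot, the full SHAP matrix can be summarized across the whole dataset; a minimal sketch:

# Global view: mean |SHAP| contribution per feature over all rows
shap.summary_plot(shap_values, df.iloc[:, 2:], plot_type='bar')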